En este notebook aplicamos los tests estadísticos y el algoritmo de decisión multicriterio a aquellos modelos que pensamos que sufren menos overfitting.
import pandas as pd
from pandas import read_csv
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
import joblib as joblib
import matplotlib.pyplot as plt
import scikit_posthocs as scp
import scipy.stats as stats
import operator
import numpy as np
import statsmodels as st
import pingouin as pg
import math
import sklearn.metrics as sm
from plotly.subplots import make_subplots
import plotly.express as px
import plotly
import plotly.graph_objects as go
import flask
C:\Users\Carmen\anaconda3\lib\site-packages\outdated\utils.py:14: OutdatedPackageWarning: The package outdated is out of date. Your version is 0.2.0, the latest is 0.2.1. Set the environment variable OUTDATED_IGNORE=1 to disable these warnings. return warn(
# Load the scaled lag-3 and lag-5 predictor tables, indexing each by its
# 'Fecha' column. Both files share the same parsing options.
df_predictor_lag3_escalado = read_csv(
    '../Datos_preprocesados/predictor_lag3_escalado.csv',
    encoding='latin-1',
    sep=',',
    na_values=['NaN', 'NaT'],
).set_index('Fecha')
df_predictor_lag5_escalado = read_csv(
    '../Datos_preprocesados/predictor_lag5_escalado.csv',
    encoding='latin-1',
    sep=',',
    na_values=['NaN', 'NaT'],
).set_index('Fecha')

# 80/20 split with shuffle=False: rows keep the order they have in the file,
# so the first 80% become train and the remaining 20% become test.
# 'Incidentes' is the target; every other column is a feature.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    df_predictor_lag3_escalado.drop(columns=['Incidentes']),
    df_predictor_lag3_escalado['Incidentes'],
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(
    df_predictor_lag5_escalado.drop(columns=['Incidentes']),
    df_predictor_lag5_escalado['Incidentes'],
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)
Cargamos los modelos
# Load every persisted model. The order matters: entries alternate
# lag 3 / lag 5 and line up position-by-position with `nombres` below.
# NOTE: joblib.load unpickles the file, so only load files you trust.
modelos = [
    joblib.load(ruta)
    for ruta in (
        'rf_lag3_escalado.pkl',
        'rf_lag5_escalado.pkl',
        'regresion_lineal_lag3_escalado.pkl',
        'regresion_lineal_lag5_escalado.pkl',
        'lasso_lag3_escalado.pkl',
        'lasso_lag5_escalado.pkl',
        'svr_lag3_escalado.pkl',
        'svr_lag5_escalado.pkl',
        'gbr_lag3_escalado.pkl',
        'gbr_lag5_escalado.pkl',
        'en_lag3_escalado.pkl',
        'en_lag5_escalado.pkl',
    )
]

# Display label for each model, in the same order as `modelos`.
nombres = [
    'RF Lag3', 'RF Lag5',
    'LR Lag3', 'LR Lag5',
    'Lasso Lag3', 'Lasso Lag5',
    'SVR Lag3', 'SVR Lag5',
    'GBR Lag3', 'GBR Lag5',
    'EN Lag3', 'EN Lag5',
]
Función que predice el instante t tanto en train como en test para cada modelo y muestra las gráficas
def pred_train_test(modelo, lag, nombre=None):
    """Predict on train and test for one model, plot both and return its errors.

    Draws a two-panel Plotly figure (train on the left, test on the right)
    comparing the model's predictions with the real target values, shows it
    with the "notebook" renderer, and computes MAE and RMSE on each split.

    Args:
        modelo: Model object; only its ``predict`` method is used.
            Presumably already fitted by the caller.
        lag: Which module-level split to use: ``3`` selects
            ``X_train_3``/``y_train_3``/``X_test_3``/``y_test_3`` and ``5``
            selects the ``*_5`` counterparts.
        nombre: Optional label appended to the figure title. When omitted,
            the label is found by matching ``modelo`` by identity against
            the module-level ``modelos``/``nombres`` lists, falling back to
            the model's class name if it is not registered there.

    Returns:
        Tuple ``(mae_train, mae_test, rmse_train, rmse_test)``.

    Raises:
        ValueError: If ``lag`` is neither 3 nor 5.
    """
    # Validate the lag up front so an unsupported value fails with a clear
    # message instead of an UnboundLocalError halfway through plotting.
    if lag == 3:
        X_train, y_train = X_train_3, y_train_3
        X_test, y_test = X_test_3, y_test_3
    elif lag == 5:
        X_train, y_train = X_train_5, y_train_5
        X_test, y_test = X_test_5, y_test_5
    else:
        raise ValueError('lag must be 3 or 5, got {!r}'.format(lag))

    if nombre is None:
        # Resolve the label from the model itself rather than from a global
        # loop index, so the title is right no matter where the call is made.
        nombre = next(
            (etiqueta for candidato, etiqueta in zip(modelos, nombres)
             if candidato is modelo),
            type(modelo).__name__,
        )

    fig = make_subplots(rows=1, cols=2)

    # Predictions on the train and test splits.
    y_pred_train_modelo = modelo.predict(X_train)
    y_pred_test_modelo = modelo.predict(X_test)

    # Left panel: train. Each trace name now matches the series it draws
    # (predictions are "Pred ...", observed values are "... Real").
    fig.add_trace(
        go.Scatter(x=y_train.index, y=y_pred_train_modelo, mode='lines', name='Pred Train'),
        row=1, col=1,
    )
    fig.add_trace(
        go.Scatter(x=y_train.index, y=y_train, mode='lines', name='Train Real'),
        row=1, col=1,
    )
    # Right panel: test.
    fig.add_trace(
        go.Scatter(x=y_test.index, y=y_pred_test_modelo, mode='lines', name='Pred test'),
        row=1, col=2,
    )
    fig.add_trace(
        go.Scatter(x=y_test.index, y=y_test, mode='lines', name='Test Real'),
        row=1, col=2,
    )

    fig.update_layout(
        showlegend=True,
        title_text='Predicciones Train VS Test ' + str(nombre),
    )
    fig.show("notebook")

    mae_train = sm.mean_absolute_error(y_train, y_pred_train_modelo)
    mae_test = sm.mean_absolute_error(y_test, y_pred_test_modelo)
    rmse_train = math.sqrt(sm.mean_squared_error(y_train, y_pred_train_modelo))
    rmse_test = math.sqrt(sm.mean_squared_error(y_test, y_pred_test_modelo))
    return mae_train, mae_test, rmse_train, rmse_test
# Per-model error metrics, collected in the same order as `modelos`.
maes_train, maes_test = [], []
rmses_train, rmses_test = [], []

for i, modelo in enumerate(modelos):
    nombre = nombres[i]
    # Models alternate in the list: even positions use lag 3, odd ones lag 5.
    lag = 5 if i % 2 else 3
    mae_train, mae_test, rmse_train, rmse_test = pred_train_test(modelo, lag)
    maes_train.append(mae_train)
    maes_test.append(mae_test)
    rmses_train.append(rmse_train)
    rmses_test.append(rmse_test)